Functions

# Load project helper functions (tab, fct_case_when, model_selection,
# save_val_results, c_ci, c_stat_ci, ev_glm, cal_plot_solo are assumed to be
# defined there — confirm against functions.R)
source(here("functions.R"))

Overview

Cohort characteristics

# Table 1: cohort characteristics with per-variable N and bold row labels
data |>
  select(
    age, female, non_white, educ_years, bmi, evertb, xray_cavit, smear_pos,
    diabetes_yn, hiv, lab_hgb, dishx_any_minus, alcoholhx, drughx, smokhx
  ) |>
  tbl_summary(missing = "no") |>
  add_n() |>
  bold_labels()
Characteristic N N = 9441
age 944 35 (25, 49)
female 944 317 (34%)
non_white 943 750 (80%)
educ_years 943 9.0 (6.0, 12.0)
bmi 944 20.2 (18.3, 22.5)
evertb 935 142 (15%)
xray_cavit 937 468 (50%)
smear_pos 943 768 (81%)
diabetes_yn 944 235 (25%)
hiv 938 182 (19%)
lab_hgb 939 12.10 (10.70, 13.30)
dishx_any_minus 944 143 (15%)
alcoholhx 944
Never 152 (16%)
Former 366 (39%)
Current 426 (45%)
drughx 943
Never 623 (66%)
Former 203 (22%)
Current 117 (12%)
smokhx 944
Never 455 (48%)
Former 273 (29%)
Current 216 (23%)

1 Statistics presented: median (IQR); n (%)

HIV-related characteristics

# HIV-related characteristics, restricted to HIV-positive participants
data |>
  filter(hiv == 1) |>
  select(art_experienced, cd4_count, cd4_lt200_all, viral_load, vl_gt200_all) |>
  tbl_summary(missing = "no") |>
  add_n() |>
  bold_labels()
Characteristic N N = 1821
art_experienced 182 71 (39%)
cd4_count 175 126 (54, 274)
cd4_lt200_all 175
HIV negative 0 (0%)
CD4 >=200 59 (34%)
CD4 <200 116 (66%)
viral_load 169 32183 (309, 205226)
vl_gt200_all 169
HIV negative 0 (0%)
VL <200 39 (23%)
VL >=200 130 (77%)

1 Statistics presented: n (%); median (IQR)

Treatment outcomes

# Treatment outcomes stratified by unsuccessful (0/1), overall column last
data |>
  select(outcome_cat, unsuccessful, tto_outcome) |>
  tbl_summary(by = unsuccessful) |>
  add_n() |>
  bold_labels() |>
  add_overall(last = TRUE)
Characteristic N 0, N = 7531 1, N = 1911 Overall, N = 944
outcome_cat 944
Cure 386 (51%) 0 (0%) 386 (41%)
Treatment complete 367 (49%) 0 (0%) 367 (39%)
Death 0 (0%) 29 (15%) 29 (3.1%)
Treatment failure 0 (0%) 29 (15%) 29 (3.1%)
Regimen switch 0 (0%) 14 (7.3%) 14 (1.5%)
Treatment incomplete 0 (0%) 56 (29%) 56 (5.9%)
Not evaluated 0 (0%) 63 (33%) 63 (6.7%)
tto_outcome 944 188 (182, 212) 107 (41, 178) 186 (179, 205)

1 Statistics presented: n (%); median (IQR)

Timing of outcomes

Time to outcome is defined as the time between treatment initiation and event. If participants abandoned treatment or were lost to follow-up, their duration of follow-up is calculated as the midpoint of their last attended and first missed visits.

# Time to outcome summarized within each outcome category
data |>
  select(outcome_cat, tto_outcome) |>
  tbl_summary(by = outcome_cat, missing = "no") |>
  add_n() |>
  bold_labels()
Characteristic N Cure, N = 3861 Treatment complete, N = 3671 Death, N = 291 Treatment failure, N = 291 Regimen switch, N = 141 Treatment incomplete, N = 561 Not evaluated, N = 631
tto_outcome 944 187 (181, 203) 189 (183, 264) 32 (10, 99) 186 (179, 219) 35 (21, 47) 99 (69, 150) 107 (41, 131)

1 Statistics presented: median (IQR)

Missing data

# Model variables (outcome first)
model_vars <- Cs(unsuccessful, hiv, diabetes_yn, lab_hgb, evertb, xray_cavit, smear_pos, age, female, bmi, dishx_any_minus, educ_years, alcoholhx, drughx, smokhx, non_white)
# Analysis dataset = dataset with all model variables and ID
analysis <- data %>%
  select(all_of(model_vars), subjid)

# IDs of participants with at least one missing model variable.
# if_any()/everything() replaces the superseded filter_all()/any_vars() idiom.
missing_data <- analysis %>%
  filter(if_any(everything(), is.na)) %>%
  pull(subjid)

# Lookup table used to re-attach outcomes and IDs after imputation;
# complete_case = 1 when the participant has no missing model variables
matches <- data %>%
  select(subjid, unsuccessful, outcome_cat, tto_outcome, tto_outcome_min, tto_outcome_max) %>%
  rowid_to_column("id") %>%
  mutate(complete_case = ifelse(subjid %in% missing_data, 0, 1))

Plots

# Missing data patterns: Hmisc::naclus clusters variables by the similarity
# of their missingness
miss_patterns <- naclus(analysis)

# Dendrogram of the missingness clustering
plot(miss_patterns)

# Fraction-missing and pattern summary plots
naplot(miss_patterns)

No concerning patterns of missingness

Redundancy analysis

# Number of model variables (reused when building formulas later)
length_model_vars <- length(model_vars)

# One-sided formula over all predictors; model_vars[-1] drops the outcome,
# which sits in position 1
redun_form <- as.formula(paste("~", paste(model_vars[-1], collapse = " + ")))

# Redundancy analysis: can any predictor be predicted from the others
# (adjusted R^2 >= 0.8, 3-knot splines)?
redun(redun_form, r2 = 0.8, data = analysis, type = "adjusted", nk = 3, pr = FALSE)

Redundancy Analysis

redun(formula = redun_form, data = analysis, r2 = 0.8, type = "adjusted", 
    nk = 3, pr = FALSE)

n: 914  p: 15   nk: 3 

Number of NAs:   30 
Frequencies of Missing Values Due to Each Variable
            hiv     diabetes_yn         lab_hgb          evertb      xray_cavit 
              6               0               5               9               7 
      smear_pos             age          female             bmi dishx_any_minus 
              1               0               0               0               0 
     educ_years       alcoholhx          drughx          smokhx       non_white 
              1               0               1               0               1 


Transformation of target variables forced to be linear

R-squared cutoff: 0.8   Type: adjusted 

R^2 with which each variable can be predicted from all other variables:

            hiv     diabetes_yn         lab_hgb          evertb      xray_cavit 
          0.284           0.132           0.175           0.024           0.107 
      smear_pos             age          female             bmi dishx_any_minus 
          0.060           0.373           0.154           0.141           0.127 
     educ_years       alcoholhx          drughx          smokhx       non_white 
          0.159           0.230           0.360           0.299           0.059 

No redundant variables

The redun function allows non-monotonic transformations for each predictor to assess its ability to predict all other variables. It also shows a simple print out of the number of missing for each variable.

Even after relaxing the \(R^2\) cutoff for identifying redundancy to 0.8, there are no redundancies.

Variable clustering

# Variable clustering using Hoeffding's D similarity (Hmisc::varclus)
vc <- varclus(redun_form,  data=analysis, sim='hoeffding')

# Dendrogram of the variable clusters
plot(vc)

No variables are clustered, so I cannot narrow the variable pool this way.

Imputation

# Number of complete cases (tab() assumed defined in functions.R — confirm)
tab(complete.cases(analysis))

FALSE  TRUE 
   30   914 
# Names of the variables with at least one missing value.
# vapply() pins the return type to logical(1), unlike sapply().
vars_with_missing <- names(which(vapply(analysis, anyNA, logical(1))))

In this dataset there are 914 complete cases, which is 96.82% of the data.

Here, I carry out imputation using the mice package

# Multiple imputation: m = 10 imputed datasets, maxit = 5 iterations each
# (m is the number of imputations, not iterations).
# Columns 2:length_model_vars are the predictors only — the outcome (col 1)
# and subjid (last col) are excluded from imputation.
mi <- mice(analysis[,2:length_model_vars], m=10 , maxit=5, seed=352, printFlag = FALSE)
# Summary of imputation methods and predictor matrix
summary(mi)
Class: mids
Number of multiple imputations:  10 
Imputation methods:
            hiv     diabetes_yn         lab_hgb          evertb      xray_cavit 
          "pmm"              ""           "pmm"           "pmm"           "pmm" 
      smear_pos             age          female             bmi dishx_any_minus 
          "pmm"              ""              ""              ""              "" 
     educ_years       alcoholhx          drughx          smokhx       non_white 
          "pmm"              ""       "polyreg"              ""           "pmm" 
PredictorMatrix:
            hiv diabetes_yn lab_hgb evertb xray_cavit smear_pos age female bmi
hiv           0           1       1      1          1         1   1      1   1
diabetes_yn   1           0       1      1          1         1   1      1   1
lab_hgb       1           1       0      1          1         1   1      1   1
evertb        1           1       1      0          1         1   1      1   1
xray_cavit    1           1       1      1          0         1   1      1   1
smear_pos     1           1       1      1          1         0   1      1   1
            dishx_any_minus educ_years alcoholhx drughx smokhx non_white
hiv                       1          1         1      1      1         1
diabetes_yn               1          1         1      1      1         1
lab_hgb                   1          1         1      1      1         1
evertb                    1          1         1      1      1         1
xray_cavit                1          1         1      1      1         1
smear_pos                 1          1         1      1      1         1

Plots

The plots below show the distribution of imputed values across the imputed datasets

densityplot(mi)
stripplot(mi)

Analysis dataset

To build a single analysis dataset, I group by id across the imputed datasets and summarize each variable with a measure of central tendency: the mode for categorical variables and the median for continuous variables. Using this completed dataset will not account for imputation uncertainty in the analysis, but I plan to do sensitivity analyses comparing results using the full set of imputations.

# Stacked dataset without original data
# (.imp == 0 is the unimputed original, dropped here)
mi_long <- complete(mi, "long", include=TRUE) %>% 
  filter(.imp!=0) %>% 
  rename(id=.id) 

# Take median of numeric variables across the 10 imputations
mi_sum_numeric <- mi_long %>%  
  group_by(id) %>% 
  summarise(across(where(is.numeric), ~ median(.x, na.rm = TRUE)))

# Take mode of factor variables across the 10 imputations.
# NOTE(review): base R mode() returns the storage mode, not the statistical
# mode — this assumes a custom mode() defined in functions.R; confirm.
mi_sum_factor <- mi_long %>%
  group_by(id) %>%
  summarise(across(where(is.factor), ~ mode(.x))) 

# Combine datasets
mi_sum <- merge(mi_sum_factor, mi_sum_numeric, by="id") %>% 
  select(!.imp) %>% 
  left_join(matches, by="id") # Add outcomes and ID back

mi_long <- left_join(mi_long, matches, by="id")
# Save datasets
# NOTE(review): absolute user-specific paths — prefer here() for portability.
save(mi_sum, file="/Users/Lauren/Box/RePORT Brazil/Datasets/Save/imputed_analysis_data.Rdata")
save(mi_long, file="/Users/Lauren/Box/RePORT Brazil/Datasets/Save/imputed_analysis_data_long.Rdata")

Main analysis

# ----- SETUP -----

# Set the rms data distribution (required for Predict()/summary() on rms fits)
dd <- datadist(mi_sum)
options(datadist = "dd")

# Seed for all analyses (use <- for assignment, per R convention)
SEED <- 1408

# Outcome vector, reused in the performance calculations below
outcome_vector <- mi_sum$unsuccessful

Test for non-linearity

# Model formula with restricted cubic splines (4 knots) on the four
# continuous predictors, used to test each for non-linearity
model_form_rcs <- as.formula(unsuccessful ~ hiv + diabetes_yn + rcs(lab_hgb,4) + evertb + xray_cavit + smear_pos + rcs(age,4) + female + rcs(bmi,4) + dishx_any_minus + rcs(educ_years,4) + alcoholhx + drughx + smokhx + non_white)

# Logistic model with splines (x/y retained for later rms utilities)
rcs_model_si <- lrm(model_form_rcs, data=mi_sum, x=TRUE, y=TRUE)

# Partial-effect plots for each splined predictor
plot(Predict(rcs_model_si, lab_hgb))
plot(Predict(rcs_model_si, age))
plot(Predict(rcs_model_si, bmi))
plot(Predict(rcs_model_si, educ_years))

# Wald tests, including the non-linear (spline) components
anova(rcs_model_si)
                Wald Statistics          Response: unsuccessful 

 Factor          Chi-Square d.f. P     
 hiv              10.10      1   0.0015
 diabetes_yn      10.07      1   0.0015
 lab_hgb          12.19      3   0.0068
  Nonlinear        0.84      2   0.6578
 evertb            0.78      1   0.3761
 xray_cavit        0.00      1   0.9676
 smear_pos         2.22      1   0.1366
 age               8.63      3   0.0347
  Nonlinear        6.86      2   0.0325
 female            2.02      1   0.1551
 bmi               2.68      3   0.4434
  Nonlinear        0.19      2   0.9086
 dishx_any_minus   0.47      1   0.4910
 educ_years        7.15      3   0.0674
  Nonlinear        1.72      2   0.4241
 alcoholhx         0.29      2   0.8661
 drughx           13.55      2   0.0011
 smokhx            5.65      2   0.0593
 non_white         1.95      1   0.1622
 TOTAL NONLINEAR   9.29      8   0.3182
 TOTAL           116.35     26   <.0001

Age needs non-linear term. Main analysis will use five age groups, but sensitivity analyses will use spline to compare results.

Full model

# Create age groups (fct_case_when assumed defined in functions.R: case_when
# returning a factor with levels in the stated order — confirm)
mi_sum <- mi_sum %>% 
  mutate(age_group = fct_case_when(
    age < 25 ~ "<25",
    age >= 25 & age < 35 ~ "25-35",
    age >= 35 & age < 45 ~ "35-45",
    age >= 45 & age < 55 ~ "45-55",
    age >= 55 ~ "55+"
  ))


# Model variables with outcome variable first (age_group replaces continuous age)
model_vars <- Cs(unsuccessful, hiv, diabetes_yn, lab_hgb, evertb, xray_cavit, smear_pos, age_group, female, bmi, dishx_any_minus, educ_years, alcoholhx, drughx, smokhx, non_white)

# Length vector for formulas later
length_model_vars <- length(model_vars)

# Full model formula 
model_formula <- as.formula(paste0("unsuccessful ~ ", paste(model_vars[2:length_model_vars], collapse = "+")))
# Full model; x/y retained for validate()/calibrate() (use TRUE, not T)
full_model <- lrm(model_formula, data = mi_sum, x = TRUE, y = TRUE)

# Save coefficients
coef_full_model <- full_model$coefficients

# Predicted values (linear predictor) from the full model
predvals_full_model <- predict(full_model)

# Bootstrap validation of the full model (200 resamples)
full_val <- validate(full_model, B=200, seed=SEED)

Performance

Brier score, calibration slope, and calibration intercept of the full model

# Brier score and calibration slope/intercept extracted from validate() output
# (save_val_results assumed defined in functions.R)
perf_full_model <- save_val_results(full_val)
perf_full_model
                       [,1]
Brier score           0.137
Calibration slope     1.000
Calibration intercept 0.000

CI for apparent c-statistic

c_full_model <- c_ci(predvals_full_model, outcome_vector)
c_full_model
     c_stat    LB    UB
[1,]  0.779 0.744 0.812

Internal validation

95% CI for the c-statistic internally validated:

c_stat_ci(full_model, data=mi_sum, samples=2000)
                 2.5% 97.5%
orig      0.779 0.744 0.814
corrected 0.745 0.710 0.779

More validation measures

full_val
          index.orig training    test optimism index.corrected   n
Dxy           0.5571   0.5879  0.5180   0.0699          0.4872 200
R2            0.2276   0.2592  0.1993   0.0598          0.1678 200
Intercept     0.0000   0.0000 -0.1773   0.1773         -0.1773 200
Slope         1.0000   1.0000  0.8364   0.1636          0.8364 200
Emax          0.0000   0.0000  0.0729   0.0729          0.0729 200
D             0.1550   0.1786  0.1342   0.0444          0.1106 200
U            -0.0021  -0.0021  0.0045  -0.0066          0.0045 200
Q             0.1571   0.1807  0.1297   0.0510          0.1061 200
B             0.1367   0.1318  0.1407  -0.0089          0.1456 200
g             1.2213   1.3629  1.1311   0.2318          0.9896 200
gp            0.1713   0.1822  0.1605   0.0217          0.1496 200

Bootstrapped backwards selection

The model selection function below was adapted from code provided by Heinze et al. It first fits the full model, then fits one round of backward selection, and then uses 500 bootstrap samples (sampling with replacement) to estimate selected models and then takes the median coefficient value across those samples.

Variable selection adds uncertainty to regression coefficients, which is quantified by the root mean square difference (RMSD) ratios. Additionally, model-based standard errors from the selected model are falsely precise because they do not account for the backward-selection process.

RMSD ratio (RMSD of bootstrap estimates are divided by the standard error of that coefficient in the global model) quantify the uncertainty of variable selection, and expresses variance inflation or deflation caused by variable selection.

The relative conditional bias estimate quantifies how much variable-selection-induced bias one would have to expect if an IV is selected, and it increases with the uncertainty of selecting each predictor. It is computed as the difference of the mean of sampled regression coefficients computed from those samples where the variable was selected and the global model regression coefficient, divided by the global model regression coefficient. relative conditional bias <10% indicates negligible bias (evidenced by the bootstrap median as close to the global coefficient)

The bootstrap medians and the 2.5th and 97.5th percentiles are corrected for that selection uncertainty and can be interpreted as point estimates and limits of 95% confidence intervals obtained from sampling-based multi-model inference.

# Set seed
set.seed(SEED)

# Bootstrapped backward selection: 500 resamples, pairwise p-value 0.01
# (model_selection adapted from Heinze et al., defined in functions.R)
main_results <- model_selection(model=model_formula, data=mi_sum, boot=500, return="all", pval_pair=0.01, seed=SEED)

Main results

main_results$overview
Global Model
Selected Model
Bootstrap
Estimate Standard error Estimate Standard error RMSD ratio Bootstrap inclusion frequency (%) Relative conditional bias (%) Median 2.5th percentile 97.5% percentile
(Intercept) 0.922 0.874 0.909 0.834 1.134 100.0 -4.23 0.918 -1.216 2.812
lab_hgb -0.171 0.050 -0.171 0.049 1.184 98.4 6.22 -0.178 -0.292 -0.074
hiv 0.779 0.240 0.780 0.230 1.119 97.0 3.43 0.780 0.000 1.263
drughxFormer 0.399 0.248 0.402 0.247 1.135 96.8 16.00 0.437 -0.045 0.950
drughxCurrent 1.097 0.293 1.124 0.285 1.319 96.8 9.14 1.157 0.000 1.877
diabetes_yn 0.696 0.212 0.690 0.210 1.204 95.6 5.22 0.701 0.000 1.175
age_group25-35 -0.444 0.260 -0.444 0.259 1.161 88.6 11.68 -0.456 -0.999 0.009
age_group35-45 -0.639 0.281 -0.644 0.279 1.248 88.6 12.03 -0.671 -1.280 0.000
age_group45-55 -1.052 0.337 -1.063 0.335 1.477 88.6 11.00 -1.102 -1.874 0.000
age_group55+ -0.357 0.337 -0.433 0.329 1.214 88.6 16.33 -0.353 -1.224 0.342
educ_years -0.056 0.024 -0.055 0.024 1.356 83.0 20.91 -0.060 -0.110 0.000
smokhxFormer 0.580 0.239 0.590 0.229 1.334 82.4 18.60 0.616 0.000 1.074
smokhxCurrent 0.462 0.266 0.484 0.257 1.241 82.4 20.42 0.492 0.000 1.136
female -0.329 0.215 -0.356 0.213 1.256 60.8 51.13 -0.368 -0.780 0.000
bmi -0.045 0.030 -0.043 0.029 1.340 57.4 48.67 -0.048 -0.109 0.000
smear_pos 0.364 0.252 0.370 0.247 1.252 53.0 53.87 0.379 0.000 0.841
non_white 0.351 0.261 0.392 0.257 1.282 52.0 73.55 0.390 0.000 0.991
evertb -0.240 0.259 0.000 0.000 1.139 34.4 131.84 0.000 -0.833 0.000
dishx_any_minus -0.210 0.287 0.000 0.000 1.179 34.2 162.40 0.000 -0.882 0.408
alcoholhxFormer 0.136 0.337 0.000 0.000 0.903 22.4 269.12 0.000 -0.248 0.931
alcoholhxCurrent 0.175 0.338 0.000 0.000 0.945 22.4 249.93 0.000 0.000 1.018
xray_cavit -0.020 0.193 0.000 0.000 0.866 18.0 -348.43 0.000 -0.385 0.414

Frequency table

The table below shows how often each candidate model was selected during bootstrap sampling. The boot50 model (including the variables selected in at least 50% of bootstrap samples) was itself selected less than 2% of the time, tying for the most frequently selected model — which illustrates the variability of the resampling procedure.

main_results$freq_table
Predictors count percent cum_percent
51 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female smear_pos non_white 9 1.8 1.8
60 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos non_white 9 1.8 3.6
16 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female smear_pos 8 1.6 5.2
33 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female non_white 8 1.6 6.8
44 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi non_white 8 1.6 8.4
5 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female 7 1.4 9.8
25 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos 6 1.2 11.0
107 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos non_white evertb 6 1.2 12.2
125 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female smear_pos dishx_any_minus 6 1.2 13.4
138 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi non_white dishx_any_minus 6 1.2 14.6
7 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi 5 1.0 15.6
12 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi 5 1.0 16.6
20 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi smear_pos 4 0.8 17.4
39 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi non_white 4 0.8 18.2
47 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent smear_pos non_white 4 0.8 19.0
77 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi smear_pos evertb 4 0.8 19.8
89 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent bmi non_white evertb 4 0.8 20.6
106 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ smokhxFormer smokhxCurrent female bmi smear_pos non_white evertb 4 0.8 21.4
173 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos non_white evertb dishx_any_minus 4 0.8 22.2
224 lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi dishx_any_minus alcoholhxFormer alcoholhxCurrent 4 0.8 23.0

Pairwise table

The table below informs us about the variables that are often included together (“rope teams”) or variables that may be substituted for one another (“competitors”).

  • A “>” sign indicates that variables are selected together more often than would be expected by chance by multiplying their individual selection probabilities
  • A “<” sign indicates that the variables were selected together less often than would be expected by chance.
main_results$pair_table
lab_hgb hiv drughxFormer drughxCurrent diabetes_yn age_group25-35 age_group35-45 age_group45-55 age_group55+ educ_years smokhxFormer smokhxCurrent female bmi smear_pos non_white evertb dishx_any_minus alcoholhxFormer alcoholhxCurrent xray_cavit
lab_hgb 98.4 95.4 95.2 95.2 94 87.4 87.4 87.4 87.4 81.6 81 81 60.4 55.8 52.4 50.8 33.6 33.4 22.2 22.2 18
hiv 97 93.8 93.8 92.6 85.6 85.6 85.6 85.6 80.6 80.2 80.2 58.2 56 51.8 50.8 33.4 33.6 21.4 21.4 17.6
drughxFormer 96.8 96.8 92.4 85.4 85.4 85.4 85.4 79.8 79.2 79.2 58.2 55.8 51 50.8 32.8 32.8 21.4 21.4 17.6
drughxCurrent > 96.8 92.4 85.4 85.4 85.4 85.4 79.8 79.2 79.2 58.2 55.8 51 50.8 32.8 32.8 21.4 21.4 17.6
diabetes_yn 95.6 84.8 84.8 84.8 84.8 79.4 79 79 57.6 56 50.6 49.4 32.6 33.2 21.8 21.8 17.6
age_group25-35 88.6 88.6 88.6 88.6 75.4 74.2 74.2 54.8 48.2 47.6 46.6 30 32 19.6 19.6 15
age_group35-45 > 88.6 88.6 88.6 75.4 74.2 74.2 54.8 48.2 47.6 46.6 30 32 19.6 19.6 15
age_group45-55 > > 88.6 88.6 75.4 74.2 74.2 54.8 48.2 47.6 46.6 30 32 19.6 19.6 15
age_group55+ > > > 88.6 75.4 74.2 74.2 54.8 48.2 47.6 46.6 30 32 19.6 19.6 15
educ_years > < < < 83 67.6 67.6 49.8 46 44.4 42.4 29.2 28.4 18.6 18.6 15
smokhxFormer 82.4 82.4 48.8 48.4 43.6 41.6 28.4 28.2 16.8 16.8 14.4
smokhxCurrent > 82.4 48.8 48.4 43.6 41.6 28.4 28.2 16.8 16.8 14.4
female 60.8 33.2 35 31.6 18.2 20.2 10.4 10.4 11.4
bmi > < < < 57.4 29 30.4 20.4 19.2 12 12 10
smear_pos 53 28.4 18.4 17.8 10.2 10.2 10.6
non_white 52 18.2 14 10.6 10.6 8.2
evertb 34.4 12.4 8.2 8.2 6.4
dishx_any_minus < 34.2 7.4 7.4 6.4
alcoholhxFormer < 22.4 22.4 4.6
alcoholhxCurrent < < 22.4 4.6
xray_cavit 18
# Save coefficients from the one-shot selected model ("sel_est") and the
# bootstrap-median coefficients used for the boot50 model ("boot_median")
coef_selected_model <- main_results$raw_overview[,"sel_est"]
coef_boot_median <- main_results$raw_overview[,"boot_median"]

Boot50

The main (“final”) model, based on variables that were selected in at least 50% of bootstrap samples.

The c-statistic of this model is:

# Extract names of variables retained in the boot50 model by stripping factor
# level suffixes from the coefficient names.
# BUG FIX: "55+" must be escaped ("55\\+") — unescaped, "+" is a regex
# quantifier, so "age_group55+" was stripped to "age_group+", leaving a stray
# unary "+" term in the rebuilt formula.
boot_vars_int <- names(coef_boot_median[which(coef_boot_median != 0)])
boot_vars_int <- gsub("Former|Never|Current|25-35|35-45|45-55|55\\+", "", boot_vars_int)
boot_vars_int <- unique(boot_vars_int)
# Drop the intercept
boot_vars <- boot_vars_int[-1]

# Boot50 formula
boot_form <- as.formula(paste0("unsuccessful ~", paste(boot_vars, collapse = "+")))

# Boot50 model; x/y retained for validate()/calibrate() (use TRUE, not T)
boot_model <- lrm(boot_form, data = mi_sum, x = TRUE, y = TRUE)

# Boot50 validation (200 bootstrap resamples)
boot_val <- validate(boot_model, B=200, seed=SEED)

# Coefficients from the model refit on variables selected in >=50% of bootstraps
coef_boot50_vars <- boot_model$coefficients

Performance

Brier score, calibration slope, and calibration intercept of the full model:

perf_boot50_model <- save_val_results(boot_val)
perf_boot50_model
                       [,1]
Brier score           0.137
Calibration slope     1.000
Calibration intercept 0.000

CI for apparent c-statistic

# Predicted values (linear predictor) from the boot50 model
predvals_boot50 <- predict(boot_model)
mi_sum$predvals_boot50 <- predvals_boot50 

# Apparent c-statistic with CI (c_ci assumed defined in functions.R)
c_boot50_model <- c_ci(predvals_boot50, outcome_vector)
c_boot50_model
     c_stat   LB    UB
[1,]  0.778 0.74 0.812

Internal validation

c_stat_ci(boot_model, data=mi_sum, samples=2000)
                 2.5% 97.5%
orig      0.778 0.742 0.812
corrected 0.754 0.720 0.789

Calibration plot

The calibration curve from Frank Harrell’s package is:

# Calibration curve (rms::calibrate, 200 bootstrap resamples).
# Fixed typo in the y-axis label: "Proability" -> "Probability".
plot(calibrate(boot_model, B=200), xlab="Predicted Probability of Unsuccessful Outcome", ylab="Actual Probability of Unsuccessful Outcome")

n=944   Mean absolute error=0.022   Mean squared error=0.00138
0.9 Quantile of absolute error=0.031

I also wrote code that constructs a calibration plot but does not include the bias-corrected line.

# My functions for calibration plot
cal_plot_solo(mi_sum, predvals_boot50, outcome_vector)

More validation measures

boot_val
          index.orig training    test optimism index.corrected   n
Dxy           0.5566   0.5770  0.5271   0.0499          0.5067 200
R2            0.2250   0.2490  0.2041   0.0450          0.1801 200
Intercept     0.0000   0.0000 -0.1420   0.1420         -0.1420 200
Slope         1.0000   1.0000  0.8749   0.1251          0.8749 200
Emax          0.0000   0.0000  0.0560   0.0560          0.0560 200
D             0.1531   0.1716  0.1377   0.0339          0.1192 200
U            -0.0021  -0.0021  0.0026  -0.0047          0.0026 200
Q             0.1552   0.1737  0.1351   0.0386          0.1166 200
B             0.1369   0.1341  0.1400  -0.0060          0.1429 200
g             1.2053   1.3066  1.1365   0.1701          1.0352 200
gp            0.1703   0.1796  0.1623   0.0173          0.1530 200

ROC curve

# Predicted risk: inverse-logit of the linear predictor
predrisk_boot50 <- 1/(1+exp(-predvals_boot50))
mi_sum$predrisk_boot50 = predrisk_boot50

# ROC curve with CI.
# NOTE(review): naming the object `roc` masks pROC::roc() for the rest of the
# session — consider renaming (e.g. roc_boot50).
roc <- roc(mi_sum, unsuccessful, predrisk_boot50, ci=TRUE)
roc 

Call:
roc.data.frame(data = mi_sum, response = unsuccessful, predictor = predrisk_boot50,     ci = TRUE)

Data: predrisk_boot50 in 753 controls (unsuccessful 0) < 191 cases (unsuccessful 1).
Area under the curve: 0.778
95% CI: 0.743-0.813 (DeLong)
plot(roc)
plot(ci.thresholds(roc), type="shape", col="lightblue")
plot(roc, add=TRUE)

Hosmer-Lemeshow test

hoslem.test(outcome_vector, predrisk_boot50, g=20)

    Hosmer and Lemeshow goodness of fit (GOF) test

data:  outcome_vector, predrisk_boot50
X-squared = 19, df = 18, p-value = 0.4

Shrinkage from boot50 model

Below I use the variables from the boot50 model and model fit to estimate the heuristic shrinkage factor.

A heuristic shrinkage factor is suggested to improve predictions in new patients, and can be estimated as: \[(\chi^2_{model} - df) / \chi^2_{model}\]

# Heuristic shrinkage factor: (model chi-square - df) / model chi-square
shrinkage_factor <- (boot_model$stats["Model L.R."] - boot_model$stats["d.f."]) / (boot_model$stats["Model L.R."])
shrinkage_factor
Model L.R. 
      0.89 
# Adjust coefficients from boot50 variable model by shrinkage factor.
# NOTE(review): this multiplies ALL coefficients, including the intercept;
# heuristic shrinkage is usually applied to the slopes only, with the
# intercept re-estimated — confirm this is intended.
coef_shrink_model <- coef_boot50_vars*shrinkage_factor

# Shrunken linear predictor and corresponding predicted risk (inverse logit)
predvals_shrink <- predict(boot_model)*shrinkage_factor
predrisk_shrink <- 1/(1+exp(-predvals_shrink))
  
mi_sum$predrisk_shrink = predrisk_shrink
mi_sum$predvals_shrink = predvals_shrink

Performance

Brier score, calibration slope, and calibration intercept of the shrunken model:

# Apparent performance of the shrunken linear predictor
# (ev_glm assumed defined in functions.R)
shrink_perf <- ev_glm(method="original", lp="predvals_shrink", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")

# Pull the apparent-performance entries
brier <- shrink_perf["Brier", "apparent"]
slope <- shrink_perf["Slope", "apparent"]
intercept <- shrink_perf["Intercept", "apparent"]

# Assemble a one-column performance summary
perf_shrink<- rbind("Brier score" = brier, "Calibration slope" = slope, "Calibration intercept" = intercept)
perf_shrink
                       [,1]
Brier score           0.137
Calibration slope     1.124
Calibration intercept 0.000

CI for apparent c-statistic

c_shrink <- c_ci(predvals_shrink, outcome_vector)
c_shrink
     c_stat    LB    UB
[1,]  0.778 0.744 0.812

Calibration plot

cal_plot_solo(mi_sum, predvals_shrink, outcome_vector)

ROC curve

plot(ev_glm(method="original", lp="predvals_shrink", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))

This is similar to the ROC curve from the non-shrunk values (coefficients from boot50 vars model)

Bootstrap medians

This section uses the median coefficients estimated from the bootstrap sampling to evaluate model performance.

## Boot medians
# Overview table from the bootstrap selection, variable names as a column
overview <- as.data.frame(main_results$raw_overview) %>% rownames_to_column("variable")

# Select non-zero values (variables retained in the boot50 model)
boot_medians <- overview %>% 
  filter(boot_median!=0) %>% 
  select(variable, boot_median)

# Apply formula: linear predictor built from the bootstrap-median coefficients.
# NOTE(review): these values are hand-transcribed (e.g. 0.9182 here vs 0.918
# in the printed overview) — fragile if the selection is re-run; consider
# computing the linear predictor directly from boot_medians instead.
mi_sum <- mi_sum %>% 
  mutate(predvals_boot_median = 
           0.9182 +
           -0.1783*lab_hgb +
           0.7803*hiv +
           0.4369*(drughx=="Former") + 
           1.157*(drughx=="Current") +            
           0.7006*diabetes_yn +
           -0.4557*(age_group=="25-35") +
           -0.6709*(age_group=="35-45") +
           -1.1024*(age_group=="45-55") +
           -0.3533*(age_group=="55+") +
           -0.0598*educ_years +    
           0.6162*(smokhx=="Former") +
           0.4924*(smokhx=="Current") +           
           -0.0477*bmi +
           -0.3680*female +
           0.3792*smear_pos + 
           0.3895*non_white,
         predrisk_boot_median = 1/(1+exp(-predvals_boot_median))
  )

# Summarize the boot-median predictions by observed outcome
mi_sum %>% 
  select(predvals_boot_median, predrisk_boot_median, unsuccessful) %>% 
  tbl_summary(by="unsuccessful")
Characteristic 0, N = 7531 1, N = 1911
predvals_boot_median -2.18 (-2.78, -1.40) -1.04 (-1.68, -0.44)
predrisk_boot_median 0.10 (0.06, 0.20) 0.26 (0.16, 0.39)

1 Statistics presented: median (IQR)

Performance

# Apparent performance of the bootstrap-median linear predictor
# (ev_glm assumed defined in functions.R)
boot_med_perf <- ev_glm(method="original", lp="predvals_boot_median", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")

# Pull the apparent-performance entries
brier <- boot_med_perf["Brier", "apparent"]
slope <- boot_med_perf["Slope", "apparent"]
intercept <- boot_med_perf["Intercept", "apparent"]

# Assemble a one-column performance summary
perf_boot_median <- rbind("Brier score" = brier, "Calibration slope" = slope, "Calibration intercept" = intercept)
perf_boot_median
                       [,1]
Brier score           0.138
Calibration slope     0.966
Calibration intercept 0.123

CI for apparent c-statistic

# Pull the bootstrap-median linear predictor back out as a vector
# (use <- for top-level assignment, per R convention)
predvals_boot_median <- mi_sum$predvals_boot_median

# Apparent c-statistic with CI (c_ci assumed defined in functions.R)
c_boot_median <- c_ci(predvals_boot_median, outcome_vector)
c_boot_median
     c_stat    LB   UB
[1,]  0.777 0.742 0.81

Calibration plot

cal_plot_solo(mi_sum, predvals_boot_median, outcome_vector)

ROC curve

plot(ev_glm(method="original", lp="predvals_boot_median", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))

LASSO

LASSO with glmnet - lambda minimum and 1se were estimated using 10-fold cross validation.

# Matrix of predictor values (model.matrix; drop the intercept column)
x_vars <- model.matrix(model_formula, mi_sum)[,-1]

# glmnet()/cv.glmnet() have no 'seed' argument — the seed=SEED passed before
# was silently ignored, so the CV fold assignment was not reproducible.
# Set the RNG seed explicitly instead.
set.seed(SEED)

# Fit lasso (alpha = 1)
lasso_fit <- glmnet(x_vars, outcome_vector, family="binomial", alpha=1)

# 10-fold cross-validation to select the optimal lambda (AUC criterion)
cv_lasso <- cv.glmnet(x_vars, outcome_vector, family = "binomial", type.measure="auc", nfolds=10)

plot(cv_lasso)

Lambda min

Using lambda min, almost all variables from the full model were included:

# Coefficients at the lambda minimizing cross-validated error
coef_lasso_min <- coef(cv_lasso, s="lambda.min")

# Predicted risk (response scale). predict() for glmnet has no 'seed'
# argument — the ignored seed=SEED arguments were removed; predictions from
# a fitted glmnet object are deterministic.
predrisk_lasso_min <- predict(cv_lasso, newx = x_vars, s="lambda.min", type="response")
predrisk_lasso_min_vec <- as.vector(predrisk_lasso_min)
mi_sum$predrisk_lasso_min <- predrisk_lasso_min_vec

# Predicted values on the linear-predictor (link) scale, the default type
predvals_lasso_min <- predict(cv_lasso, newx = x_vars, s="lambda.min")
predvals_lasso_min_vec <- as.vector(predvals_lasso_min)
mi_sum$predvals_lasso_min <- predvals_lasso_min_vec

# Predicted class at the 0.5 risk threshold
predout_lasso_min <- ifelse(predrisk_lasso_min > 0.5, 1, 0)

Performance

The lambda min model had an accuracy (defined as the concordance between predicted outcome - predicted risk >0.5 = outcome, predicted risk <=0.5 = no outcome vs. the true values of the outcome) of:

# Accuracy: proportion of 0.5-threshold classifications matching the outcome
mean(predout_lasso_min == outcome_vector)
[1] 0.798

Brier score, calibration slope, and calibration intercept of the model:

# Apparent performance (Brier score, calibration slope/intercept) of the
# lambda-min LASSO model, via the ev_glm() helper
lasso_min_perf <- ev_glm(method="original", lp="predvals_lasso_min", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")

brier <- lasso_min_perf["Brier", "apparent"]
slope <- lasso_min_perf["Slope", "apparent"]
intercept <- lasso_min_perf["Intercept", "apparent"]

# One-column matrix with labelled rows, for printing and later export
perf_lasso_min <- rbind("Brier score" = brier, "Calibration slope" = slope, "Calibration intercept" = intercept)
perf_lasso_min
                       [,1]
Brier score           0.137
Calibration slope     1.021
Calibration intercept 0.024

CI for apparent c-statistic

# Apparent c-statistic with 95% CI for the lambda-min LASSO predictions
c_lasso_min <- c_ci(predvals_lasso_min_vec, outcome_vector)
c_lasso_min
     c_stat    LB    UB
[1,]  0.778 0.743 0.811

Calibration plot

And here is the calibration curve and c-statistic:

# rms::val.prob draws the calibration curve and prints the performance
# indices (Dxy, C, Brier, intercept/slope, ...) for the lambda-min risks
val.prob(predrisk_lasso_min, outcome_vector, cex=.5)
      Dxy   C (ROC)        R2         D  D:Chi-sq       D:p         U  U:Chi-sq 
  0.55646   0.77823   0.22749   0.15489 147.22043        NA  -0.00207   0.04736 
      U:p         Q     Brier Intercept     Slope      Emax       E90      Eavg 
  0.97660   0.15696   0.13662   0.02371   1.02056   0.16528   0.03948   0.02230 
      S:z       S:p 
  0.17367   0.86213 

ROC curve

# ROC curve for the lambda-min LASSO model
plot(ev_glm(method="original", lp="predvals_lasso_min", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))

Lambda 1se

coef_lasso_1se <- coef(cv_lasso, s="lambda.1se")

# Predicted values and risks at lambda.1se.
# NOTE(review): predict.cv.glmnet() has no `seed` argument; prediction is
# deterministic, so the previous `seed=SEED` was inert and is dropped.
# Also use `<-` for top-level assignment rather than `=`.
predrisk_lasso_1se <- predict(cv_lasso, newx = x_vars, s="lambda.1se", type="response")
predrisk_lasso_1se_vec <- as.vector(predrisk_lasso_1se)
mi_sum$predrisk_lasso_1se <- predrisk_lasso_1se_vec


predvals_lasso_1se <- predict(cv_lasso, newx = x_vars, s="lambda.1se")
predvals_lasso_1se_vec <- as.vector(predvals_lasso_1se)
mi_sum$predvals_lasso_1se <- predvals_lasso_1se_vec

# Predicted outcome (0/1) at a 0.5 risk threshold
predout_lasso_1se <-  ifelse(predrisk_lasso_1se > 0.5, 1, 0)

Performance

The lambda 1se model had an accuracy (defined as the concordance between predicted outcome - predicted risk >0.5 = outcome, predicted risk <=0.5 = no outcome vs. the true values of the outcome) of:

# Accuracy: proportion of 0.5-threshold classifications matching the outcome
mean(predout_lasso_1se == outcome_vector)
[1] 0.8

Brier score, calibration slope, and calibration intercept of the full model:

# Apparent performance (Brier score, calibration slope/intercept) of the
# lambda-1se LASSO model, via the ev_glm() helper
lasso_1se_perf <- ev_glm(method="original", lp="predvals_lasso_1se", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")

brier <- lasso_1se_perf["Brier", "apparent"]
slope <- lasso_1se_perf["Slope", "apparent"]
intercept <- lasso_1se_perf["Intercept", "apparent"]

# One-column matrix with labelled rows, for printing and later export
perf_lasso_1se <- rbind("Brier score" = brier, "Calibration slope" = slope, "Calibration intercept" = intercept)
perf_lasso_1se
                       [,1]
Brier score           0.145
Calibration slope     1.786
Calibration intercept 1.002

CI for apparent c-statistic

# Apparent c-statistic with 95% CI for the lambda-1se LASSO predictions
c_lasso_1se <- c_ci(predvals_lasso_1se_vec, outcome_vector)
c_lasso_1se
     c_stat    LB    UB
[1,]  0.744 0.705 0.782

Calibration plot

And here is the calibration curve and c-statistic:

# rms::val.prob draws the calibration curve and prints the performance
# indices (Dxy, C, Brier, intercept/slope, ...) for the lambda-1se risks
val.prob(predrisk_lasso_1se, outcome_vector, cex=.5)
      Dxy   C (ROC)        R2         D  D:Chi-sq       D:p         U  U:Chi-sq 
 4.88e-01  7.44e-01  1.46e-01  9.63e-02  9.19e+01        NA  1.98e-02  2.07e+01 
      U:p         Q     Brier Intercept     Slope      Emax       E90      Eavg 
 3.27e-05  7.65e-02  1.45e-01  1.00e+00  1.79e+00  1.53e-01  1.02e-01  5.55e-02 
      S:z       S:p 
-1.28e+00  2.00e-01 

ROC curve

# ROC curve for the lambda-1se LASSO model
plot(ev_glm(method="original", lp="predvals_lasso_1se", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))

Model approximation

The data below shows the steps of removal of variables from the full model. This is the first step of model approximation: we regress all candidate variables against the predicted values from the full model, and can then estimate how well each reduced set of variables reproduces the full model's predictions.

# Model approximation step 1: regress the full model's predicted values on
# all candidate variables (model_vars[1] is presumably the outcome, so it is
# skipped -- confirm).  sigma=1 fixes the residual SD -- presumably so
# fastbw's statistics reflect approximation loss rather than residual
# variance; verify against rms::ols documentation.
approx_full_formula <- as.formula(paste0("predvals_full_model ~", (paste(model_vars[2:length_model_vars], collapse = "+"))))

approx_full_model <- ols(approx_full_formula, sigma=1, data=mi_sum)

# aics=10000 forces deletion of every factor, so the output ranks all
# variables by how much their removal degrades the approximation (R2 column)
fastbw(approx_full_model, aics=10000) 

 Deleted         Chi-Sq d.f. P      Residual d.f. P      AIC     R2   
 xray_cavit        0.08 1    0.7770    0.08   1   0.7770   -1.92 1.000
 alcoholhx         2.86 2    0.2388    2.94   3   0.4003   -3.06 0.997
 dishx_any_minus   5.37 1    0.0204    8.32   4   0.0806    0.32 0.993
 evertb            7.09 1    0.0077   15.41   5   0.0087    5.41 0.986
 bmi              18.41 1    0.0000   33.82   6   0.0000   21.82 0.970
 smear_pos        18.50 1    0.0000   52.32   7   0.0000   38.32 0.953
 non_white        26.00 1    0.0000   78.31   8   0.0000   62.31 0.930
 female           30.30 1    0.0000  108.61   9   0.0000   90.61 0.903
 educ_years       52.73 1    0.0000  161.34  10   0.0000  141.34 0.856
 hiv              61.26 1    0.0000  222.59  11   0.0000  200.59 0.801
 diabetes_yn      72.61 1    0.0000  295.21  12   0.0000  271.21 0.736
 age_group        66.46 4    0.0000  361.67  16   0.0000  329.67 0.676
 smokhx           95.49 2    0.0000  457.16  18   0.0000  421.16 0.591
 lab_hgb         250.25 1    0.0000  707.41  19   0.0000  669.41 0.367
 drughx          410.34 2    0.0000 1117.75  21   0.0000 1075.75 0.000

Approximate Estimates after Deleting Factors

       Coef    S.E. Wald Z P
[1,] -1.671 0.03255 -51.34 0

Factors in Final Model

None

We see that the variables through age can predict the full model with an R2 of 0.986.

The code below confirms how well the boot50 model approximates the predicted values from the full model.

# How well do the boot50 variables approximate the full model?  Regress the
# full model's predicted values on the boot50 variable set with OLS.
# Fix: use `<-` for top-level assignment rather than `=`.
approx_boot_formula <- as.formula(paste0("predvals_full_model ~ ",  (paste(boot_vars, collapse = "+"))))

approx_boot_model <- ols(approx_boot_formula, data=mi_sum, x=TRUE, y=TRUE)

coef_approx_model <- coef(approx_boot_model)

# R2 of the approximation (1 would mean a perfect reproduction)
approx_boot_model$stats["R2"]
   R2 
0.986 

The mean absolute error between predicted values from the approximate model and predicted values from the full model is:

# Predicted values from the approximate (boot50-variable) OLS model
predvals_approx_mod <- predict(approx_boot_model)
mi_sum$predvals_approx_mod <- predvals_approx_mod

# Absolute error between approximated model predicted values and full model
# predicted values (both on the linear-predictor scale)
abs_err <- mean(abs(predvals_approx_mod - predvals_full_model))
abs_err
[1] 0.105

Performance

Brier score, calibration slope, and calibration intercept of the model:

# Apparent performance (Brier score, calibration slope/intercept) of the
# approximated model, via the ev_glm() helper
approx_perf <- ev_glm(method="original", lp="predvals_approx_mod", outcome="unsuccessful", data=mi_sum, samples=200, return="performance")

brier <- approx_perf["Brier", "apparent"]
slope <- approx_perf["Slope", "apparent"]
intercept <- approx_perf["Intercept", "apparent"]

# One-column matrix with labelled rows, for printing and later export
perf_approx_model <- rbind("Brier score" = brier, "Calibration slope" = slope, "Calibration intercept" = intercept)
perf_approx_model
                        [,1]
Brier score            0.137
Calibration slope      0.994
Calibration intercept -0.004

CI for apparent c-statistic:

# Apparent c-statistic with 95% CI for the approximated model
c_approx_model <- c_ci(predvals_approx_mod, outcome_vector)
c_approx_model
     c_stat    LB    UB
[1,]  0.778 0.742 0.813

Calibration plot

# Calibration plot for the approximated model (project helper)
cal_plot_solo(mi_sum, predvals_approx_mod, outcome_vector)

ROC curve

# ROC curve for the approximated model
plot(ev_glm(method="original", lp="predvals_approx_mod", outcome="unsuccessful", data=mi_sum, samples=1, return="roc"))

Model comparison

Coefficients

# Extract names of variables retained in selected model, stripping factor
# level suffixes so e.g. "drughxFormer" collapses back to "drughx".
# Fix: "+" is a regex quantifier, so the "55+" age level must be escaped --
# the unescaped pattern only consumed "55" and left a stray "+" on the
# variable name (e.g. "age_group55+" -> "age_group+").
selected_vars_int <- names(coef_selected_model[which(coef_selected_model!=0)])
selected_vars_int <- gsub("Former|Never|Current|25-35|35-45|45-55|55\\+", "", selected_vars_int)
selected_vars_int <- unique(selected_vars_int)
selected_vars <- selected_vars_int[-1]  # drop the intercept term

# Refit the selected variables with lrm (x/y stored for validate())
selected_form <- as.formula(paste0("unsuccessful ~", (paste(selected_vars, collapse = "+"))))

selected_model <- lrm(selected_form, data = mi_sum, x = TRUE, y = TRUE)

predvals_selected_model <- predict(selected_model)

# Model performance selected model - don't need to output because its the same as the boot50 model, but want to store for table
c_selected_model <- c_ci(predvals_selected_model, outcome_vector)

selected_model_val <- validate(selected_model)
perf_selected_model <- save_val_results(selected_model_val)
# Extract coefficients from each modeling approach and compare

# Extract coefficients from each modeling approach into two-column data
# frames (variable, estimate) with cleaned term names so they merge cleanly.
# clean_terms() strips parentheses and "=" from rms-style terms (e.g.
# "drughx=Former" -> "drughxFormer").  Where restricted cubic splines were
# used, paren removal leaves "rcsage, 4" behind, which is stripped
# separately.  Fix: the lasso_min cleanup previously ran the "rcsage, 4"
# substitution twice (once inside the combined pattern, once alone); once is
# sufficient and the result is identical.
clean_terms <- function(x) gsub("[(]|[)]|=", "", x)

coef_full <- data.frame(coef_full_model) %>%
  rownames_to_column("variable") %>%
  rename(full_model = coef_full_model) %>%
  mutate(variable = clean_terms(variable))

coef_lasso_min <- data.frame(as.array(coef_lasso_min)) %>%
  rownames_to_column("variable") %>%
  rename(lasso_min = X1) %>%
  mutate(variable = clean_terms(variable),
         variable = gsub("rcsage, 4", "", variable))

coef_lasso_1se <- data.frame(as.array(coef_lasso_1se)) %>%
  rownames_to_column("variable") %>%
  rename(lasso_1se = X1) %>%
  mutate(variable = clean_terms(variable),
         variable = gsub("rcsage, 4", "", variable))

coef_selected <- data.frame(coef_selected_model) %>%
  rownames_to_column("variable") %>%
  rename(selected_model = coef_selected_model) %>%
  mutate(variable = clean_terms(variable),
         variable = gsub("rcsage, 4", "", variable))

coef_median <- data.frame(coef_boot_median) %>%
  rownames_to_column("variable") %>%
  rename(boot_median = coef_boot_median) %>%
  mutate(variable = clean_terms(variable),
         variable = gsub("rcsage, 4", "", variable))

coef_boot50_model <- data.frame(coef_boot50_vars) %>%
  rownames_to_column("variable") %>%
  rename(boot50_model = coef_boot50_vars) %>%
  mutate(variable = clean_terms(variable))

coef_shrink <- data.frame(coef_shrink_model) %>%
  rownames_to_column("variable") %>%
  rename(coef_shrink = coef_shrink_model) %>%
  mutate(variable = clean_terms(variable))

coef_approx <- data.frame(coef_approx_model) %>%
  rownames_to_column("variable") %>%
  rename(approx_model = coef_approx_model) %>%
  mutate(variable = clean_terms(variable))


# Row order for the final table follows the boot-median variable order
var_order <- coef_median %>% pull(variable)
  
# Full outer merge across all approaches; coefficients absent from a given
# approach become 0 (i.e. "not selected")
coefficients_table <- Reduce(function(x,y) merge(x, y, by = "variable", all.x = TRUE, all.y = TRUE),
       list(coef_full, coef_selected, coef_median, coef_boot50_model, coef_shrink, coef_approx, coef_lasso_min, coef_lasso_1se)) %>% 
  mutate_if(is.numeric, ~replace(., is.na(.), 0)) %>% 
  arrange(factor(variable, var_order))

coefficients_table

Performance measures

# -- C statistics ---
# Stack the (c_stat, LB, UB) rows from every modeling approach
c_stat_list <- as.data.frame(rbind(c_full_model, c_selected_model, c_boot_median, c_boot50_model, c_shrink, c_approx_model, c_lasso_min, c_lasso_1se))

rownames(c_stat_list) <- c("full_model", "selected_model" , "boot_median", "boot50_model", "shrink", "approx", "lambda_min", "lambda_1se")

# Format each c-statistic as "estimate (LB, UB)".
# Fix: paste(..., sep="") replaced with the idiomatic paste0().
c_stats <- c_stat_list %>% 
  rownames_to_column("model") %>% 
  mutate_if(is.numeric, round, 2) %>% 
  mutate(c_ci = paste0(c_stat, " (", LB, ", ", UB, ")")) %>% 
  select(model, c_ci)

#--- Brier score, slope, and intercept ---
# Each perf_* object is a 3x1 labelled matrix; cbind then transpose gives
# one row per model with the three statistics as columns
performance_list <- as.data.frame(t(cbind(perf_full_model, perf_selected_model, perf_boot_median, perf_boot50_model, perf_shrink, perf_approx_model, perf_lasso_min, perf_lasso_1se)))

rownames(performance_list) <- c("full_model", "selected_model", "boot_median", "boot50_model", "shrink", "approx", "lambda_min", "lambda_1se")

performance <- performance_list %>%
  rownames_to_column("model") %>% 
  rename(brier_score = "Brier score",
         cal_slope = "Calibration slope",
         cal_int = "Calibration intercept")


#---- Combine data ---
models_performance <- performance %>% 
  full_join(c_stats, by="model") 

models_performance
# Transpose so models are columns and statistics are rows (for export)
t_model_performance <- as.data.frame(t(models_performance)) %>% rownames_to_column("stat")

Main results

Classification table

Based on coefficients from the boot50 model, here is a classification table broken down by deciles. Each row denotes the classification values for people above that decile are considered as “unsuccessful” (positive) outcome vs. “successful” (negative) outcome.

# Positive = unsuccessful 
# Negative = successful 

n_total = nrow(mi_sum)
all_positive = nrow(filter(mi_sum, unsuccessful==1))
all_negative = nrow(filter(mi_sum, unsuccessful==0))

classification_table <- mi_sum %>% 
  mutate(risk_group = ntile(predvals_boot50, 10)) %>% 
  group_by(risk_group) %>% 
  summarize(
    n_group = n(), 
    outcome = sum(unsuccessful),
    no_outcome = n_group-outcome,
    obs_prob_outcome = mean(unsuccessful),
    pred_prob_outcome = mean(predvals_boot50),
    min_pred_val = min(predvals_boot50),
    max_pred_val = max(predvals_boot50)) %>% 
  mutate(n_below_cut_off = cumsum(n_group),
         perc_below_cut_off = n_below_cut_off/n_total,
         cm_outcome = cumsum(outcome),
         cm_no_outcome =cumsum(no_outcome),
         tp = all_positive - cm_outcome,
         tn = cm_no_outcome,
         fp = all_negative - cm_no_outcome,
         fn = cm_outcome,
         sensitivity = tp/(tp+fn),
         specificity = tn/(tn+fp),
         ppv = tp/(tp+fp),
         npv = tn/(tn+fn)) %>% 
  select(risk_group, max_pred_val, n_below_cut_off, perc_below_cut_off, cm_outcome, cm_no_outcome, tp, tn, fp, fn, sensitivity, specificity, ppv, npv)
    

classification_table

Nomogram

# Set data distribution (rms requires a datadist to determine axis ranges
# for the nomogram)
dd <- datadist (mi_sum)
options(datadist="dd")

# Coefficients of the model the nomogram is built from
boot_model$coefficients
      Intercept         lab_hgb             hiv   drughx=Former  drughx=Current 
         0.9089         -0.1709          0.7802          0.4017          1.1243 
    diabetes_yn age_group=25-35 age_group=35-45 age_group=45-55   age_group=55+ 
         0.6902         -0.4444         -0.6439         -1.0635         -0.4327 
     educ_years   smokhx=Former  smokhx=Current          female             bmi 
        -0.0547          0.5904          0.4841         -0.3564         -0.0434 
      smear_pos       non_white 
         0.3702          0.3924 
# Relabel model terms for display.  NOTE(review): Newlabels() appears to
# match labels to terms by position, so this vector must stay in the same
# order as the model's terms (lab_hgb, hiv, drughx, diabetes_yn, age_group,
# educ_years, smokhx, female, bmi, smear_pos, non_white per the coefficient
# printout above) -- re-verify if the model formula ever changes.
boot_model <- Newlabels(boot_model, c("Hemoglobin (g/dL)", "HIV", "Drug use", "Diabetes", "Age group", "Education (years)", "Tobacco use", "Female sex", "BMI",  "Smear positive", "Non-white"))

# Nomogram: fun=plogis converts the linear predictor to a probability;
# fun.at fixes the tick marks shown on the risk axis
nom <- nomogram(boot_model, 
                fun=plogis,
                fun.at=c(.001,.01,.05,seq(.1,.9,by=.1),.95,.99,.999),
                funlabel="Risk of unsuccessful outcome",
                vnames="labels")

plot(nom)

Risk plot

# Risk plot: histogram of the boot50 linear predictor with the observed
# outcome probability per bin overlaid on a secondary axis
pred_vals <- predvals_boot50

# NOTE(review): inside mutate(), the RHS `pred_vals` resolves to the global
# vector above (assuming mi_sum has no pred_vals column) -- confirm.
graph_data <- mi_sum %>%
  mutate(pred_vals = ifelse(pred_vals < -4, -4, pred_vals), # Censor extreme observations
         pred_vals = ifelse(pred_vals > 1.5, 1.5, pred_vals), # Censor extreme observations
    risk_group = cut(pred_vals,
                       breaks = 15)) %>% 
  group_by(risk_group) %>% 
  summarize(outcome_prob = mean(unsuccessful),
            count = n(),
            min_pred_val = min(pred_vals),
            max_pred_val = max(pred_vals),
            mean_pred_val = mean(pred_vals))

# Scale factor maps outcome probability onto the count axis so both series
# share one panel; the secondary axis reverses the transformation
scaleFactor <- max(graph_data$count) / max(graph_data$outcome_prob)
text_size = 15

graph <- graph_data %>% 
  ggplot(aes(x=mean_pred_val, y=count)) +
  geom_bar(stat="identity") +
  geom_line(aes(y=outcome_prob*scaleFactor), color="tomato1", size=1) +
  scale_y_continuous("Participant count", expand=c(0,0), sec.axis = sec_axis(~./scaleFactor, name = "Outcome probability (%)")) +
  labs(x="Predicted value") +
  theme(
    axis.line.y.right = element_line(color = "tomato1"), 
    axis.ticks.y.right = element_line(color = "tomato1"),
    axis.title.y.right=element_text(color="tomato1", size=text_size),
    axis.title.y.left=element_text(size=text_size),
    axis.title.x = element_text(size=text_size),
    axis.text.y.right=element_text(color="tomato1", size=12),
    axis.text.y.left = element_text(size=12),
    axis.text.x = element_text(size=12),
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    panel.background = element_blank(), 
    axis.line = element_line(colour = "black"))
  
graph

#----  Save all results to Excel -----
coefficients <- coefficients_table
performance <- t_model_performance

library(openxlsx)

# Pair each sheet name with the object written to it, then create and
# populate the workbook in one loop (same sheets, same order as before)
sheet_contents <- list(
  "Model-Selection"      = overview,
  "Coefficients"         = coefficients,
  "Performance"          = performance,
  "Classification-Table" = classification_table
)

# Create a blank workbook and add/write each sheet
results <- createWorkbook()
for (sheet in names(sheet_contents)) {
  addWorksheet(results, sheet)
  writeData(results, sheet = sheet, x = sheet_contents[[sheet]])
}

# Export the file
saveWorkbook(results, here("Manuscript", "tables", "main_analyses_results.xlsx"), overwrite=TRUE)

Check

Compare results from the 500 bootstraps above to the model_development function that will be used for sensitivity analyses (note: the call below passes bootstraps=500, although this check was originally described as using 100 bootstraps — confirm which is intended).

set.seed(SEED)

# Set data distribution
dd <- suppressWarnings(datadist(mi_sum))
options(datadist="dd")

# NOTE(review): the narrative above says this check uses 100 bootstraps from
# model_development(), but bootstraps=500 is passed here -- confirm which
# was intended.
save_results_all <- model_development(data=mi_sum, 
                                  outcome=mi_sum$unsuccessful, 
                                  model_formula=model_formula, 
                                  approx_full_formula=approx_full_formula, 
                                  bootstraps=500, 
                                  seed=SEED)


# Model selection
save_results_all$raw_overview
# Model coefficients
save_results_all$coefficients_table
# Model performance
save_results_all$models_performance
# C-statistics
save_results_all$c_statistics
$full_model
                 2.5% 97.5%
orig      0.779 0.742 0.811
corrected 0.746 0.711 0.780

$boot50_model
                 2.5% 97.5%
orig      0.778 0.743 0.812
corrected 0.754 0.719 0.789
# Boot50 calibration
save_results_all$boot50_calibration

Added value

Setup

# Merge imputed data from dissertation analysis to hiv_vars from dissertation data 
hiv_vars <- Cs(art_experienced, cd4_count, viral_load)
hiv_data <- data %>% select(subjid, all_of(hiv_vars))

# Add subjid to mi_sum
analysis_data <- mi_sum %>% 
  left_join(hiv_data, by="subjid") %>% 
   mutate(art_experienced = ifelse(hiv==1 & is.na(art_experienced), 0, art_experienced))  # Change never started ART to no prior ART 

# Create dataset with only PLWH for imputation 
hiv_analysis_data <- analysis_data %>% filter(hiv==1) 
 
# Create data frame to merge back imputed data
hiv_matches <- hiv_analysis_data %>% 
  select(subjid, id) %>% 
  rowid_to_column("hiv_id") 

# Variables used for imputation of HIV-related variables
hiv_impute <- Cs(diabetes_yn, lab_hgb, evertb, xray_cavit, smear_pos, age_group, female, bmi, dishx_any_minus, educ_years, alcoholhx, drughx, smokhx, non_white, cd4_count, viral_load, art_experienced)
  
# Impute
# 8 values of CD4 and 14 values of viral load were imputed
# NOTE(review): seed=352 differs from the SEED constant used elsewhere in
# this file -- confirm this is intentional.
mi_hiv <- mice(hiv_analysis_data[,hiv_impute], m=10 , maxit=5, seed=352, printFlag = FALSE)

# Stack the 10 completed datasets in long format
mi_long_hiv <- complete(mi_hiv, "long") %>% 
  rename(id=.id) 
# Keep only variables that were imputed and add subjids;
# summarise across imputations by taking the per-person median
mi_sum_hiv <- mi_long_hiv %>% 
  rename(hiv_id = id) %>% 
  select(hiv_id, cd4_count, viral_load) %>% 
  group_by(hiv_id) %>% 
  summarise(across(where(is.numeric), ~ median(.x, na.rm = TRUE))) %>% 
  left_join(hiv_matches, by="hiv_id")

# Add HIV summarized data to analysis dataset (drop the original, partly
# missing CD4/viral-load columns first, then join the imputed summaries)
data_to_analyze <- analysis_data %>% 
  select(!cd4_count & !viral_load) %>% 
  full_join(mi_sum_hiv, by=Cs(subjid, id)) 
# Add added-value variables.
# Build the categories so HIV-negative participants keep their own level and
# are not dropped from analysis.

# Derive the three HIV-related categorical variables on any dataset carrying
# hiv, art_experienced, cd4_count, and viral_load.  Extracted as a helper so
# the identical mutate() is not duplicated for the two datasets below.
add_hiv_categories <- function(df) {
  df %>% 
    mutate(
      baseline_art_all = fct_case_when(
        hiv==0 | is.na(hiv) ~ "HIV negative",
        art_experienced==1 ~ "Prior ART",
        art_experienced==0 ~ "Not on ART"),
      cd4_lt200_all = fct_case_when( # AIDS = CD4 < 200
         hiv==0 | is.na(hiv) ~ "HIV negative",
         cd4_count>=200 ~ "CD4 >=200",
         cd4_count <200 ~ "CD4 <200"),
      vl_gt200_all = fct_case_when( # Viral suppression = viral load < 200
         hiv==0 | is.na(hiv) ~ "HIV negative",
         viral_load <200 ~ "VL <200",
         viral_load >=200 ~ "VL >=200"))
}

# Add to data with missing data from some HIV variables
data_with_missing <- add_hiv_categories(analysis_data)

# Add to data with HIV variables imputed
data_to_analyze <- add_hiv_categories(data_to_analyze)

Comparison of the distribution of HIV related variables before and after imputation

# HIV-related variables among PLWH BEFORE imputation (missingness shown)
data_with_missing %>% 
  filter(hiv==1) %>% 
  select(all_of(hiv_vars), cd4_lt200_all, vl_gt200_all, unsuccessful) %>% 
  tbl_summary(by="unsuccessful") %>% 
  add_overall()
Characteristic Overall, N = 182 0, N = 1171 1, N = 651
art_experienced 71 (39%) 53 (45%) 18 (28%)
cd4_count 126 (54, 274) 132 (52, 266) 120 (56, 272)
Unknown 7 2 5
viral_load 32183 (309, 205226) 30532 (212, 216615) 36892 (820, 158092)
Unknown 13 7 6
cd4_lt200_all
HIV negative 0 (0%) 0 (0%) 0 (0%)
CD4 >=200 59 (34%) 39 (34%) 20 (33%)
CD4 <200 116 (66%) 76 (66%) 40 (67%)
Unknown 7 2 5
vl_gt200_all
HIV negative 0 (0%) 0 (0%) 0 (0%)
VL <200 39 (23%) 28 (25%) 11 (19%)
VL >=200 130 (77%) 82 (75%) 48 (81%)
Unknown 13 7 6

1 Statistics presented: n (%); median (IQR)

# HIV-related variables among PLWH AFTER imputation (no missing rows)
data_to_analyze %>% 
  filter(hiv==1) %>% 
  select(all_of(hiv_vars), cd4_lt200_all, vl_gt200_all, unsuccessful) %>% 
  tbl_summary(by='unsuccessful') %>% 
  add_overall()
Characteristic Overall, N = 182 0, N = 1171 1, N = 651
art_experienced 71 (39%) 53 (45%) 18 (28%)
cd4_count 134 (56, 277) 135 (53, 280) 129 (61, 267)
viral_load 28774 (313, 176050) 28666 (193, 191751) 36892 (731, 148380)
cd4_lt200_all
HIV negative 0 (0%) 0 (0%) 0 (0%)
CD4 >=200 63 (35%) 41 (35%) 22 (34%)
CD4 <200 119 (65%) 76 (65%) 43 (66%)
vl_gt200_all
HIV negative 0 (0%) 0 (0%) 0 (0%)
VL <200 42 (23%) 30 (26%) 12 (18%)
VL >=200 140 (77%) 87 (74%) 53 (82%)

1 Statistics presented: n (%); median (IQR)

# Full model formula, refit with both rms::lrm and stats::glm (x/y stored on
# the fit objects for downstream validation helpers).
# Fix: spell out TRUE rather than the reassignable shorthand T.
full_model_lrm <- lrm(model_formula, data=mi_sum, x=TRUE, y=TRUE)
full_model_glm <- glm(model_formula, data = mi_sum, family="binomial", x = TRUE, y = TRUE)

# Boot50 model formula
boot50_model_lrm <-  lrm(boot_form, data=mi_sum, x=TRUE, y=TRUE)
boot50_model_glm <-  glm(boot_form, data = mi_sum, family="binomial", x = TRUE, y = TRUE)

Analysis - imputed data

# ---- Base model ----
# Boot50 model refit on the dataset with imputed HIV variables
base_hiv <- lrm(boot_form, data=data_to_analyze)

# ---- Outcome -----
y = data_to_analyze$unsuccessful

ART

# Base-model variables plus baseline ART status
add_art <-  lrm(unsuccessful ~ lab_hgb + drughx + educ_years + diabetes_yn + smokhx + bmi + non_white + female + smear_pos + age_group + baseline_art_all, data=data_to_analyze) 

# Added value function
add_art_results <- added_value(base_hiv, add_art, y)

Likelihood ratio test:

add_art_results$lrtest

Model 1: unsuccessful ~ lab_hgb + hiv + drughx + diabetes_yn + age_group + 
    age_group + +educ_years + smokhx + female + bmi + smear_pos + 
    non_white
Model 2: unsuccessful ~ lab_hgb + drughx + educ_years + diabetes_yn + 
    smokhx + bmi + non_white + female + smear_pos + age_group + 
    baseline_art_all

L.R. Chisq       d.f.          P 
    4.7312     1.0000     0.0296 

NRI and IDI:

add_art_results$nri_idi
        est       lb     ub
nri 0.21629  0.05990 0.3727
idi 0.00574 -0.00079 0.0123

Added value metrics recommended by Frank Harrell:

add_art_results$table

CD4

Likelihood ratio test:

# Base-model variables plus the CD4 category
add_cd4 <-  lrm(unsuccessful ~ lab_hgb + drughx + educ_years + diabetes_yn + smokhx + bmi + non_white + female + smear_pos + age_group + cd4_lt200_all, data=data_to_analyze) 

# Added-value comparison against the base model (project helper)
add_cd4_results <- added_value(base_hiv, add_cd4, y)

The likelihood ratio test is:

add_cd4_results$lrtest

Model 1: unsuccessful ~ lab_hgb + hiv + drughx + diabetes_yn + age_group + 
    age_group + +educ_years + smokhx + female + bmi + smear_pos + 
    non_white
Model 2: unsuccessful ~ lab_hgb + drughx + educ_years + diabetes_yn + 
    smokhx + bmi + non_white + female + smear_pos + age_group + 
    cd4_lt200_all

L.R. Chisq       d.f.          P 
    0.0126     1.0000     0.9106 

NRI and IDI:

add_cd4_results$nri_idi
          est        lb       ub
nri -0.123944 -0.281256 0.033368
idi  0.000065 -0.000283 0.000413

Added value table:

add_cd4_results$table

VL

# Base-model variables plus the viral-load category
add_vl <-  lrm(unsuccessful ~ lab_hgb + drughx + educ_years + diabetes_yn + smokhx + bmi + non_white + female + smear_pos + age_group + vl_gt200_all, data=data_to_analyze) 

# Added-value comparison against the base model (project helper)
add_vl_results <- added_value(base_hiv, add_vl, y)

Likelihood ratio test:

add_vl_results$lrtest

Model 1: unsuccessful ~ lab_hgb + hiv + drughx + diabetes_yn + age_group + 
    age_group + +educ_years + smokhx + female + bmi + smear_pos + 
    non_white
Model 2: unsuccessful ~ lab_hgb + drughx + educ_years + diabetes_yn + 
    smokhx + bmi + non_white + female + smear_pos + age_group + 
    vl_gt200_all

L.R. Chisq       d.f.          P 
     0.799      1.000      0.371 

NRI and IDI:

add_vl_results$nri_idi
         est       lb      ub
nri 0.117714 -0.03798 0.27341
idi 0.000871 -0.00168 0.00343

Added value table:

add_vl_results$table

Analysis - CC

Complete case analysis. No one was missing ART, so I will only repeat for CD4 count and viral load.

CD4

# --- Non-missing data: participants with an observed CD4 category ---
cc_analysis <- data_with_missing %>% 
  filter(!is.na(cd4_lt200_all))


# ---- Base model (boot50 formula) refit on the complete cases ----
base_hiv <- lrm(boot_form, data=cc_analysis)

# ---- Outcome vector for the complete cases -----
y <- cc_analysis$unsuccessful
# Base-model variables plus the CD4 category
add_cd4 <-  lrm(unsuccessful ~ lab_hgb + drughx + educ_years + diabetes_yn + smokhx + bmi + non_white + female + smear_pos + age_group + cd4_lt200_all, data=cc_analysis) 

add_cd4_results <- added_value(base_hiv, add_cd4, y)

Likelihood ratio test:

add_cd4_results$lrtest

Model 1: unsuccessful ~ lab_hgb + hiv + drughx + diabetes_yn + age_group + 
    age_group + +educ_years + smokhx + female + bmi + smear_pos + 
    non_white
Model 2: unsuccessful ~ lab_hgb + drughx + educ_years + diabetes_yn + 
    smokhx + bmi + non_white + female + smear_pos + age_group + 
    cd4_lt200_all

L.R. Chisq       d.f.          P 
     0.125      1.000      0.724 

NRI and IDI:

add_cd4_results$nri_idi
          est        lb      ub
nri -0.106696 -0.265494 0.05210
idi  0.000315 -0.000793 0.00142

Added value table:

add_cd4_results$table

VL

# --- Non-missing data: participants with an observed viral-load category ---
cc_analysis <- data_with_missing %>% 
  filter(!is.na(vl_gt200_all))


# ---- Base model (boot50 formula) refit on the complete cases ----
base_hiv <- lrm(boot_form, data=cc_analysis)

# ---- Outcome vector for the complete cases -----
y <- cc_analysis$unsuccessful
# Base-model variables plus the viral-load category
add_vl <-  lrm(unsuccessful ~ lab_hgb + drughx + educ_years + diabetes_yn + smokhx + bmi + non_white + female + smear_pos + age_group + vl_gt200_all, data=cc_analysis) 

add_vl_results <- added_value(base_hiv, add_vl, y)

Likelihood ratio test:

add_vl_results$lrtest

Model 1: unsuccessful ~ lab_hgb + hiv + drughx + diabetes_yn + age_group + 
    age_group + +educ_years + smokhx + female + bmi + smear_pos + 
    non_white
Model 2: unsuccessful ~ lab_hgb + drughx + educ_years + diabetes_yn + 
    smokhx + bmi + non_white + female + smear_pos + age_group + 
    vl_gt200_all

L.R. Chisq       d.f.          P 
     0.771      1.000      0.380 

NRI and IDI:

add_vl_results$nri_idi
         est      lb      ub
nri 0.135918 -0.0214 0.29322
idi 0.000777 -0.0018 0.00335

Added value table:

# Added-value table for viral load (complete cases)
add_vl_results$table
# Record package versions for reproducibility
sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] openxlsx_4.1.5          ResourceSelection_0.3-5 pROC_1.16.2            
 [4] hrbrthemes_0.8.6        cowplot_1.0.0           glmnet_4.0-2           
 [7] Matrix_1.2-18           rms_6.0-1               SparseM_1.78           
[10] mice_3.11.0             rpart_4.1-15            gtsummary_1.3.2        
[13] gmodels_2.18.1          skimr_2.1.2             here_0.1               
[16] forcats_0.5.0           stringr_1.4.0           dplyr_1.0.2            
[19] purrr_0.3.4             readr_1.4.0             tidyr_1.1.2            
[22] tibble_3.0.4            tidyverse_1.3.0         kableExtra_1.1.0       
[25] Hmisc_4.4-1             ggplot2_3.3.2           Formula_1.2-4          
[28] survival_3.1-12         lattice_0.20-41        

loaded via a namespace (and not attached):
  [1] TH.data_1.0-10      colorspace_2.0-0    ellipsis_0.3.1     
  [4] rprojroot_2.0.2     htmlTable_2.1.0     base64enc_0.1-3    
  [7] fs_1.5.0            rstudioapi_0.13     farver_2.0.3       
 [10] MatrixModels_0.4-1  fansi_0.4.1         mvtnorm_1.1-1      
 [13] lubridate_1.7.9     xml2_1.3.2          codetools_0.2-16   
 [16] splines_4.0.2       extrafont_0.17      knitr_1.30         
 [19] jsonlite_1.7.1      gt_0.2.1            Rttf2pt1_1.3.8     
 [22] broom_0.7.0         cluster_2.1.0       dbplyr_1.4.4       
 [25] png_0.1-7           compiler_4.0.2      httr_1.4.2         
 [28] backports_1.2.0     assertthat_0.2.1    cli_2.1.0          
 [31] htmltools_0.5.0     quantreg_5.75       tools_4.0.2        
 [34] gtable_0.3.0        glue_1.4.2          Rcpp_1.0.5         
 [37] cellranger_1.1.0    vctrs_0.3.4         gdata_2.18.0       
 [40] nlme_3.1-148        extrafontdb_1.0     conquer_1.0.2      
 [43] iterators_1.0.12    xfun_0.19           rvest_0.3.6        
 [46] lifecycle_0.2.0     gtools_3.8.2        polspline_1.1.19   
 [49] MASS_7.3-51.6       zoo_1.8-8           scales_1.1.1       
 [52] hms_0.5.3           sandwich_3.0-0      RColorBrewer_1.1-2 
 [55] yaml_2.2.1          gridExtra_2.3       sass_0.2.0         
 [58] gdtools_0.2.2       latticeExtra_0.6-29 stringi_1.5.3      
 [61] highr_0.8           foreach_1.5.0       checkmate_2.0.0    
 [64] zip_2.1.0           cpp11_0.2.4         shape_1.4.4        
 [67] repr_1.1.0          commonmark_1.7      systemfonts_0.3.1  
 [70] rlang_0.4.8         pkgconfig_2.0.3     matrixStats_0.57.0 
 [73] evaluate_0.14       labeling_0.4.2      htmlwidgets_1.5.2  
 [76] tidyselect_1.1.0    plyr_1.8.6          magrittr_1.5       
 [79] R6_2.5.0            generics_0.1.0      multcomp_1.4-15    
 [82] DBI_1.1.0           mgcv_1.8-31         pillar_1.4.6       
 [85] haven_2.3.1         foreign_0.8-80      withr_2.3.0        
 [88] nnet_7.3-14         modelr_0.1.8        crayon_1.3.4       
 [91] rmarkdown_2.5       jpeg_0.1-8.1        grid_4.0.2         
 [94] readxl_1.3.1        data.table_1.13.2   blob_1.2.1         
 [97] reprex_0.3.0        digest_0.6.27       webshot_0.5.2      
[100] munsell_0.5.0       viridisLite_0.3.0